Online Job Posting Analysis

Import Libraries

In [1]:
import pandas as pd
import numpy as np
import nltk

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# spacy for lemmatization
import spacy
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

from nltk.corpus import stopwords
stop_words = stopwords.words('english')
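
If the NLTK corpora are not installed locally, the stopword list (and the tokenizer behind nltk.word_tokenize used further below) may need a one-time download; a small sketch, assuming a fresh environment:

import nltk
nltk.download('stopwords')  # required for stopwords.words('english')
nltk.download('punkt')      # required for nltk.word_tokenize used further below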

Import Dataset

In [2]:
df = pd.read_csv("data job posts.csv")

# Keep only rows that have a Title
df = df[df['Title'].notnull()]
df.reset_index(inplace=True)

df.head(5)
Out[2]:
index jobpost date Title Company AnnouncementCode Term Eligibility Audience StartDate ... Salary ApplicationP OpeningDate Deadline Notes AboutC Attach Year Month IT
0 0 AMERIA Investment Consulting Company\nJOB TITL... Jan 5, 2004 Chief Financial Officer AMERIA Investment Consulting Company NaN NaN NaN NaN NaN ... NaN To apply for this position, please submit a\nc... NaN 26 January 2004 NaN NaN NaN 2004 1 False
1 1 International Research & Exchanges Board (IREX... Jan 7, 2004 Full-time Community Connections Intern (paid i... International Research & Exchanges Board (IREX) NaN NaN NaN NaN NaN ... NaN Please submit a cover letter and resume to:\nI... NaN 12 January 2004 NaN The International Research & Exchanges Board (... NaN 2004 1 False
2 2 Caucasus Environmental NGO Network (CENN)\nJOB... Jan 7, 2004 Country Coordinator Caucasus Environmental NGO Network (CENN) NaN NaN NaN NaN NaN ... NaN Please send resume or CV toursula.kazarian@...... NaN 20 January 2004\nSTART DATE: February 2004 NaN The Caucasus Environmental NGO Network is a\nn... NaN 2004 1 False
3 3 Manoff Group\nJOB TITLE: BCC Specialist\nPOSI... Jan 7, 2004 BCC Specialist Manoff Group NaN NaN NaN NaN NaN ... NaN Please send cover letter and resume to Amy\nPe... NaN 23 January 2004\nSTART DATE: Immediate NaN NaN NaN 2004 1 False
4 4 Yerevan Brandy Company\nJOB TITLE: Software D... Jan 10, 2004 Software Developer Yerevan Brandy Company NaN NaN NaN NaN NaN ... NaN Successful candidates should submit\n- CV; \n-... NaN 20 January 2004, 18:00 NaN NaN NaN 2004 1 True

5 rows × 25 columns

In [3]:
# Get basic info about the DataFrame for a programmatic assessment
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18973 entries, 0 to 18972
Data columns (total 25 columns):
index               18973 non-null int64
jobpost             18973 non-null object
date                18973 non-null object
Title               18973 non-null object
Company             18973 non-null object
AnnouncementCode    1206 non-null object
Term                7671 non-null object
Eligibility         4929 non-null object
Audience            640 non-null object
StartDate           9672 non-null object
Duration            10788 non-null object
Location            18948 non-null object
JobDescription      15090 non-null object
JobRequirment       16459 non-null object
RequiredQual        18496 non-null object
Salary              9614 non-null object
ApplicationP        18920 non-null object
OpeningDate         18275 non-null object
Deadline            18915 non-null object
Notes               2208 non-null object
AboutC              12460 non-null object
Attach              1548 non-null object
Year                18973 non-null int64
Month               18973 non-null int64
IT                  18973 non-null bool
dtypes: bool(1), int64(3), object(21)
memory usage: 3.5+ MB

Find the job nature

In [4]:
# Find the job nature (job titles) and store them in variable X
X=df['Title']
In [5]:
len(X)
Out[5]:
18973

Creating the corpus

In [7]:
import re
corpus = []
for i in range(0, len(X)):
    review = re.sub(r'\W', ' ', str(X[i]))        # replace non-word characters with spaces
    review = re.sub(r'^br$', ' ', review)         # drop a title that is nothing but 'br'
    review = re.sub(r'\s+[a-z]\s+', ' ', review)  # drop single lowercase letters between spaces
    review = re.sub(r'^[a-z]\s+', ' ', review)    # drop a single lowercase letter at the start
    review = re.sub(r'\s+', ' ', review)          # collapse repeated whitespace
    corpus.append(review)

Remove all punctuation from the titles

In [8]:
# Tokenize the titles; simple_preprocess lowercases, tokenizes and drops punctuation
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True also strips accent marks

data_words = list(sent_to_words(corpus))

data_words[0]
Out[8]:
['chief', 'financial', 'officer']

Define stopwords, bigram/trigram models and lemmatization for the title data

In [9]:
# Build the bigram and trigram models
bigram = gensim.models.Phrases(data_words, min_count=3, threshold=100)  # a higher threshold yields fewer phrases
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)  

# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)
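
As a quick sanity check on the phrase models, a token list can be passed through the fitted phraser; tokens that co-occur often enough (per min_count and threshold) come back joined with an underscore, which is where tokens such as 'full_time' in the lemmatized output further below come from. A small illustration; the exact result depends on the fitted corpus:

# Tokens forming a detected phrase are merged into a single underscore-joined token
sample = ['full', 'time', 'programmatic', 'intern']
print(bigram_mod[sample])  # e.g. ['full_time', 'programmatic', 'intern'] if the phrase was learned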
In [10]:
stop_words = stopwords.words('english')
# Domain-specific words to ignore in titles; note that simple_preprocess lowercases tokens,
# so only the lowercase entries below actually match anything
stop_words.extend(['from', 'subject', 're', 'edu', 'use', 'na', 'Senior', 'new', 'branch', 'Junior', 'unit', 'department', 'Specialist', 'the'])
In [11]:
# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out
In [12]:
# Remove Stop Words
data_words_nostops = remove_stopwords(data_words)

# Form Bigrams
data_words_bigrams = make_bigrams(data_words_nostops)

data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

print(data_lemmatized[:1])
[['chief', 'financial', 'officer']]
In [13]:
data_lemmatized[0:10]
Out[13]:
[['chief', 'financial', 'officer'],
 ['community', 'connection', 'intern', 'pay', 'internship'],
 ['country', 'coordinator'],
 ['specialist'],
 ['software', 'developer'],
 [],
 ['chief', 'accountant', 'finance', 'assistant'],
 ['pay', 'part', 'full_time', 'programmatic', 'intern'],
 ['assistant', 'manage', 'director'],
 ['program', 'assistant']]
In [14]:
# Some titles end up as empty lists after preprocessing, so drop them
list2 = [x for x in data_lemmatized if x != []]
In [15]:
for i in range(len(list2)):
    list2[i] = ' '.join(list2[i])
In [16]:
list2[0:5]
Out[16]:
['chief financial officer',
 'community connection intern pay internship',
 'country coordinator',
 'specialist',
 'software developer']

Now we will select the top 1,000 title words for a word cloud

In [17]:
# Count word frequencies across all cleaned titles
titlecount = {}
for data in list2:
    words = nltk.word_tokenize(data)
    for word in words:
        if word not in titlecount:
            titlecount[word] = 1
        else:
            titlecount[word] += 1
In [18]:
len(titlecount)
Out[18]:
1832
In [19]:
import heapq
# Select the 1000 most frequent title words
freq_words = heapq.nlargest(1000, titlecount, key=titlecount.get)
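
As an aside, the manual counting loop and the heapq call can be expressed more compactly with collections.Counter; a minimal sketch producing the same frequency ranking, assuming list2 as built above:

from collections import Counter

# Count word frequencies across all cleaned titles in one pass
titlecount = Counter(word for data in list2 for word in nltk.word_tokenize(data))

# The 1000 most frequent title words, equivalent to heapq.nlargest above
freq_words = [word for word, count in titlecount.most_common(1000)]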
In [20]:
%%capture
#gather features
text = " ".join(freq_words)
In [21]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt
In [22]:
wordcloud = WordCloud(stopwords=[],max_font_size=60).generate(text)
plt.figure(figsize=(16,12))

# plot wordcloud in matplotlib
plt.imshow(wordcloud)
plt.axis("off")
plt.show()
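
Note that joining the top words into one string gives every word the same weight in the cloud; if word sizes should reflect actual counts, the frequency dictionary can be passed to WordCloud directly. A minimal sketch, assuming the titlecount dictionary built above:

# Size words by their actual counts instead of weighting each top word equally
wordcloud = WordCloud(stopwords=[], max_font_size=60).generate_from_frequencies(titlecount)
plt.figure(figsize=(16,12))
plt.imshow(wordcloud)
plt.axis("off")
plt.show()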

Job nature change over time

In [23]:
for i in range(len(data_lemmatized)):
    data_lemmatized[i] = ' '.join(data_lemmatized[i])
In [24]:
date_field=df['Year'].tolist()

len(date_field)
Out[24]:
18973
In [25]:
# np.column_stack casts everything to strings, so Year is converted back to int below
Job_year = pd.DataFrame(np.column_stack([data_lemmatized, date_field]),
                        columns=['Job_title', 'Year'])
In [26]:
Job_year.head(5)
Out[26]:
Job_title Year
0 chief financial officer 2004
1 community connection intern pay internship 2004
2 country coordinator 2004
3 specialist 2004
4 software developer 2004
In [27]:
Job_year.dtypes
Out[27]:
Job_title    object
Year         object
dtype: object
In [28]:
# Converting Year back to a numeric value

Job_year['Year']=Job_year['Year'].astype('int')
In [29]:
Job_year.dtypes
Out[29]:
Job_title    object
Year          int32
dtype: object
In [30]:
Job_year.Year.value_counts()
Out[30]:
2012    2140
2015    2009
2013    2009
2014    1980
2008    1782
2011    1695
2007    1538
2010    1506
2009    1191
2005    1138
2006    1111
2004     874
Name: Year, dtype: int64
In [31]:
# Divide Year into 3 equal-width bins to examine how the job nature changes over time

Job_year['Year_bins']=pd.cut(Job_year['Year'],3,labels=['Period1','Period2','Period3'])
In [32]:
Job_year.pivot_table(values='Year',index='Year_bins',aggfunc=['min','max','count'])
Out[32]:
min max count
Year Year Year
Year_bins
Period1 2004 2007 4661
Period2 2008 2011 6174
Period3 2012 2015 8138

Job nature change over the period (2004 to 2007)

In [33]:
X1=Job_year[Job_year['Year_bins']=='Period1'].iloc[:,0]
In [34]:
# Selecting the top 500 title words for the word cloud

titlecount = {}
for data in X1:
    words = nltk.word_tokenize(data)
    for word in words:
        if word not in titlecount.keys():
            titlecount[word] = 1
        else:
            titlecount[word] += 1
In [35]:
# import heapq
freq_words = heapq.nlargest(500,titlecount,key=titlecount.get)
In [36]:
%%capture
#gather features
text = " ".join(freq_words)
In [37]:
# Draw a word cloud for the nature of jobs in this period
wordcloud = WordCloud(stopwords=[],max_font_size=60).generate(text)
plt.figure(figsize=(16,12))
# plot wordcloud in matplotlib
plt.imshow(wordcloud)
plt.axis("off")
plt.show()

Job nature change over the period (2008 to 2011)

In [38]:
X2=Job_year[Job_year['Year_bins']=='Period2'].iloc[:,0]
In [39]:
# Selecting the top 500 title words for the word cloud

titlecount = {}
for data in X2:
    words = nltk.word_tokenize(data)
    for word in words:
        if word not in titlecount.keys():
            titlecount[word] = 1
        else:
            titlecount[word] += 1
            
# import heapq
freq_words = heapq.nlargest(500,titlecount,key=titlecount.get)
In [40]:
%%capture
#gather features
text = " ".join(freq_words)
In [41]:
wordcloud = WordCloud(stopwords=[],max_font_size=60).generate(text)
plt.figure(figsize=(16,12))
# plot wordcloud in matplotlib
plt.imshow(wordcloud)
plt.axis("off")
plt.show()

Job nature change over the period (2012 to 2015)

In [42]:
X3=Job_year[Job_year['Year_bins']=='Period3'].iloc[:,0]
In [43]:
# Selecting the top 500 title words for the word cloud

titlecount = {}
for data in X3:
    words = nltk.word_tokenize(data)
    for word in words:
        if word not in titlecount.keys():
            titlecount[word] = 1
        else:
            titlecount[word] += 1
            
# import heapq
freq_words = heapq.nlargest(500,titlecount,key=titlecount.get)
In [44]:
%%capture
#gather features
text = " ".join(freq_words)
In [45]:
wordcloud = WordCloud(stopwords=[],max_font_size=60).generate(text)
plt.figure(figsize=(16,12))
# plot wordcloud in matplotlib
plt.imshow(wordcloud)
plt.axis("off")
plt.show()

IT Job Classification

In [46]:
# Defining X: concatenate Title, JobRequirment and RequiredQual
# (rows with a missing part become NaN and end up as the literal string 'nan' below)
df['X1'] = df['Title'].str.cat(df['JobRequirment'], sep =" ").str.cat(df['RequiredQual'], sep =" ")
X=df['X1']

# Defining Y
di={False:0,True:1}
df['IT_y']=df['IT'].map(di)
y=df['IT_y']
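
Before modelling, it helps to check the class balance, since IT postings are the minority class (roughly one in five rows, as the support counts in the classification reports below suggest); a quick check:

# Share of IT (1) vs non-IT (0) postings
print(y.value_counts(normalize=True))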
In [47]:
# Creating the corpus (same cleaning as for the titles, plus lowercasing)
import re
corpus = []
for i in range(0, len(X)):
    review = re.sub(r'\W', ' ', str(X[i]))
    review = review.lower()
    review = re.sub(r'^br$', ' ', review)
    review = re.sub(r'\s+[a-z]\s+', ' ',review)
    review = re.sub(r'^[a-z]\s+', ' ', review)
    review = re.sub(r'\s+', ' ', review)
    corpus.append(review)  
In [48]:
corpus[0]
Out[48]:
'chief financial officer supervises financial management and administrative staff including assigning responsibilities reviewing employees work processes and products counseling employees giving performance evaluations and recommending disciplinary action serves as member of management team participating in both strategic and operational planning for the company directs and oversees the company financial management activities including establishing and monitoring internal controls managing cash and investments and managing the investment portfolio in collaboration with the investment team leader this includes but is not limited to evaluation of investment risk concentration risk fund deployment levels adequacy of loss and liquidity reserves assists investment team in development of proper documentation and internal systems directs and oversees the annual budgeting process including developing projections for financial planning and preparing budgets prepares external and internal financial management reports such as audited financial statements tax returns and reports for the board of directors and company staff develops implements and maintains efficient and effective accounting systems and controls to ensure compliance with national and international accounting standards and principles sufficiency of fund accounting and comprehensiveness of data for reporting and compliance requirements ensures contract compliance including interpreting and monitoring contracts with clients submitting required reports and monitoring covenants and other contract terms oversees the design implementation and maintenance of computer based information system oversees records retention both manual and computer based and file maintenance activities serves as company risk manager including evaluating loss exposure and obtaining insurance as appropriate manages other administrative operations such as facilities management payroll administration office operations and administrative support monitors corporate compliance with by laws and articles of incorporation regarding corporate registration and reporting of fundraising operations to perform this job successfully an individual must be able to perform each essential duty satisfactorily the requirements listed below are representative of the knowledge skill and or ability required knowledge of generally accepted accounting principles local accounting standards and legislation state reporting requirements pertaining to accounting principles and practices of financial management and budgeting principles and practices of financial systems design and analysis principles and practices of contract management records management and risk management principles and practices of management and supervision principles and practices of information systems management ability to apply sound fiscal and administrative practices to the company activities plan organize and supervise the work of subordinate employees including training them assigning and evaluating their work and providing job performance feedback critically analyze fiscal and administrative policies practices procedures and systems and recommend and implement changes as needed gather and synthesize financial information from variety of sources and present it to variety of audiences with differing financial management and analysis expertise prepare detailed comprehensive financial reports including explanatory text operate ibm compatible personal computer including word processing spreadsheet and database software 
applications operate specialized software applications that support the financial management and budgeting functions qualifications minimum of 5 7 years accounting corporate finance banking experience including role as cfo excellent finance and accounting technical skills coupled with demonstrated knowledge of all key financial functions in an consulting company context accounting finance control treasury reserving and reporting strong financial planning and analytical skills and experience and the ability to work closely with and support the ceo and other executives in strategic development and implementation excellent leadership management and supervisory track record of attracting selecting developing rewarding and retaining high caliber accounting and finance executive and teams who achieve business goals an undergraduate degree in finance business or other related discipline is required cpa cfa acca or other financial certification is highly preferred as is masters degree in business administration accounting or finance fluency in english armenian and russian with outstanding writing skills excellent analytical communication teamwork interpersonal skills need to be well organized and detail oriented as well as goal result driven and able to deal with complex issues '
In [49]:
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()

# Stemming
for i in range(len(corpus)):
    words = nltk.word_tokenize(corpus[i])
    words = [stemmer.stem(word) for word in words]
    corpus[i] = ' '.join(words)  
In [50]:
corpus[0]
Out[50]:
'chief financi offic supervis financi manag and administr staff includ assign respons review employe work process and product counsel employe give perform evalu and recommend disciplinari action serv as member of manag team particip in both strateg and oper plan for the compani direct and overse the compani financi manag activ includ establish and monitor intern control manag cash and invest and manag the invest portfolio in collabor with the invest team leader thi includ but is not limit to evalu of invest risk concentr risk fund deploy level adequaci of loss and liquid reserv assist invest team in develop of proper document and intern system direct and overse the annual budget process includ develop project for financi plan and prepar budget prepar extern and intern financi manag report such as audit financi statement tax return and report for the board of director and compani staff develop implement and maintain effici and effect account system and control to ensur complianc with nation and intern account standard and principl suffici of fund account and comprehens of data for report and complianc requir ensur contract complianc includ interpret and monitor contract with client submit requir report and monitor coven and other contract term overse the design implement and mainten of comput base inform system overse record retent both manual and comput base and file mainten activ serv as compani risk manag includ evalu loss exposur and obtain insur as appropri manag other administr oper such as facil manag payrol administr offic oper and administr support monitor corpor complianc with by law and articl of incorpor regard corpor registr and report of fundrais oper to perform thi job success an individu must be abl to perform each essenti duti satisfactorili the requir list below are repres of the knowledg skill and or abil requir knowledg of gener accept account principl local account standard and legisl state report requir pertain to account principl and practic of financi manag and budget principl and practic of financi system design and analysi principl and practic of contract manag record manag and risk manag principl and practic of manag and supervis principl and practic of inform system manag abil to appli sound fiscal and administr practic to the compani activ plan organ and supervis the work of subordin employe includ train them assign and evalu their work and provid job perform feedback critic analyz fiscal and administr polici practic procedur and system and recommend and implement chang as need gather and synthes financi inform from varieti of sourc and present it to varieti of audienc with differ financi manag and analysi expertis prepar detail comprehens financi report includ explanatori text oper ibm compat person comput includ word process spreadsheet and databas softwar applic oper special softwar applic that support the financi manag and budget function qualif minimum of 5 7 year account corpor financ bank experi includ role as cfo excel financ and account technic skill coupl with demonstr knowledg of all key financi function in an consult compani context account financ control treasuri reserv and report strong financi plan and analyt skill and experi and the abil to work close with and support the ceo and other execut in strateg develop and implement excel leadership manag and supervisori track record of attract select develop reward and retain high calib account and financ execut and team who achiev busi goal an undergradu degre in financ busi or other relat disciplin is 
requir cpa cfa acca or other financi certif is highli prefer as is master degre in busi administr account or financ fluenci in english armenian and russian with outstand write skill excel analyt commun teamwork interperson skill need to be well organ and detail orient as well as goal result driven and abl to deal with complex issu'
In [51]:
# Creating the TF-IDF model directly
# (min_df=0.05 and max_df=0.8 drop rare and very common stems, so far fewer than
#  max_features terms survive -- hence the 331 columns reported below)
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(max_features = 2000, min_df = 0.05, max_df = 0.8, stop_words = stopwords.words('english'))

X = vectorizer.fit_transform(corpus).toarray()
In [52]:
X.shape
Out[52]:
(18973, 331)

Building a Logistic Regression Model

In [53]:
from sklearn.model_selection import train_test_split

train_x,test_x,train_y,test_y=train_test_split(X,
                                              y,
                                              test_size=.3,
                                              random_state=42)
In [54]:
from sklearn.linear_model import LogisticRegression

logreg = LogisticRegression()
logreg.fit( train_x, train_y )
Out[54]:
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)
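
A side note on methodology: the TF-IDF vectorizer above was fit on the full corpus before splitting, so document frequencies from the test rows leak into the features. A minimal leakage-free sketch using an sklearn Pipeline with the same estimators (variable names here are illustrative, and results may differ slightly from the numbers below):

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Split the raw stemmed documents first, then let the pipeline fit TF-IDF on the training fold only
docs_train, docs_test, y_train, y_test = train_test_split(corpus, y, test_size=0.3, random_state=42)

pipe = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=2000, min_df=0.05, max_df=0.8,
                              stop_words=stopwords.words('english'))),
    ('clf', LogisticRegression()),
])
pipe.fit(docs_train, y_train)
print('test accuracy:', pipe.score(docs_test, y_test))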

Model Evaluation

In [55]:
#Predicting the test cases
from sklearn import metrics
test_accuracy=metrics.accuracy_score(test_y,logreg.predict(test_x))
print('test_accuracy: ',test_accuracy)

train_accuracy=metrics.accuracy_score(train_y,logreg.predict(train_x))
print('train_accuracy: ',train_accuracy)
test_accuracy:  0.9009135628952917
train_accuracy:  0.91092538212484
In [56]:
# Note: AUC here is computed from hard 0/1 predictions;
# passing predict_proba scores would give the conventional ROC AUC
print('AUC train :',metrics.roc_auc_score(train_y,logreg.predict(train_x)))
print('AUC test :',metrics.roc_auc_score(test_y,logreg.predict(test_x)))
AUC train : 0.8128788197036333
AUC test : 0.8020340209126331
In [57]:
# Creating a confusion matrix

from sklearn import metrics

cm = metrics.confusion_matrix(test_y,
                            logreg.predict(test_x), [0,1] )
cm

import matplotlib.pyplot as plt
import seaborn as sn
%matplotlib inline

sn.heatmap(cm, annot=True,  fmt='.2f', xticklabels = ["0", "1"] , yticklabels = ["0", "1"] )
plt.ylabel('True label')
plt.xlabel('Predicted label')
Out[57]:
Text(0.5, 15.0, 'Predicted label')
In [58]:
from sklearn.metrics import classification_report
print(classification_report(test_y,logreg.predict(test_x)))
             precision    recall  f1-score   support

          0       0.91      0.97      0.94      4543
          1       0.83      0.64      0.72      1149

avg / total       0.90      0.90      0.90      5692

In [59]:
test_predicted_prob=pd.DataFrame(logreg.predict_proba(test_x))[[1]]
test_predicted_prob.columns=['prob']
actual=test_y.reset_index()
actual.drop('index',axis=1,inplace=True)

# making a DataFrame with actual and prob columns
df_test_predict = pd.concat([actual, test_predicted_prob], axis=1)
df_test_predict.columns = ['actual','prob']
df_test_predict.head()
Out[59]:
actual prob
0 0 0.026706
1 0 0.054805
2 1 0.982340
3 0 0.004979
4 0 0.052737
In [60]:
test_roc_like_df = pd.DataFrame()
test_temp = df_test_predict.copy()

for cut_off in np.linspace(0,1,50):
    test_temp['predicted'] = test_temp['prob'].apply(lambda x: 0 if x < cut_off else 1)
    test_temp['tp'] = test_temp.apply(lambda x: 1 if x['actual']==1 and x['predicted']==1 else 0, axis=1)
    test_temp['fp'] = test_temp.apply(lambda x: 1 if x['actual']==0 and x['predicted']==1 else 0, axis=1)
    test_temp['tn'] = test_temp.apply(lambda x: 1 if x['actual']==0 and x['predicted']==0 else 0, axis=1)
    test_temp['fn'] = test_temp.apply(lambda x: 1 if x['actual']==1 and x['predicted']==0 else 0, axis=1)
    sensitivity = test_temp['tp'].sum() / (test_temp['tp'].sum() + test_temp['fn'].sum())
    specificity = test_temp['tn'].sum() / (test_temp['tn'].sum() + test_temp['fp'].sum())
    
    accuracy=(test_temp['tp'].sum()+test_temp['tn'].sum()) / (test_temp['tp'].sum() + test_temp['fn'].sum()+test_temp['tn'].sum() + test_temp['fp'].sum())
    
    test_roc_like_table = pd.DataFrame([cut_off, sensitivity, specificity,accuracy]).T
    test_roc_like_table.columns = ['cutoff', 'sensitivity', 'specificity','accuracy']
    test_roc_like_df = pd.concat([test_roc_like_df, test_roc_like_table], axis=0)
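
The threshold sweep above can also be done with sklearn's roc_curve, which returns the true-positive rate (sensitivity) and false-positive rate (1 - specificity) at every candidate threshold; a minimal sketch using the same predicted probabilities:

from sklearn.metrics import roc_curve

# fpr = 1 - specificity, tpr = sensitivity, one entry per candidate threshold
fpr, tpr, thresholds = roc_curve(df_test_predict['actual'], df_test_predict['prob'])

# Youden's J statistic: the threshold maximizing sensitivity + specificity - 1
best_idx = np.argmax(tpr - fpr)
print('suggested cutoff:', thresholds[best_idx])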
In [61]:
test_roc_like_df.head()
Out[61]:
cutoff sensitivity specificity accuracy
0 0.000000 1.000000 0.000000 0.201862
0 0.020408 0.989556 0.446621 0.556219
0 0.040816 0.979983 0.623157 0.695186
0 0.061224 0.973020 0.692274 0.748946
0 0.081633 0.965187 0.730795 0.778110
In [62]:
test_temp.sum()
plt.subplots(figsize=(10,4))
plt.scatter(test_roc_like_df['cutoff'], test_roc_like_df['sensitivity'], marker='*', label='Sensitivity')
plt.scatter(test_roc_like_df['cutoff'], test_roc_like_df['specificity'], marker='*', label='Specificity')
#plt.scatter(test_roc_like_df['cutoff'], 1-test_roc_like_df['specificity'], marker='*', label='FPR')
plt.title('Sensitivity and specificity at each cutoff')
plt.legend()
Out[62]:
<matplotlib.legend.Legend at 0x57747b8>
In [63]:
# Find the cut-off that maximizes sensitivity + accuracy on the test set
test_roc_like_df['total'] = test_roc_like_df['sensitivity'] + test_roc_like_df['accuracy']
test_roc_like_df[test_roc_like_df['total']==test_roc_like_df['total'].max()]
Out[63]:
cutoff sensitivity specificity accuracy total
0 0.122449 0.953003 0.766674 0.804287 1.757289
In [64]:
df_test_predict['predicted'] = df_test_predict['prob'].apply(lambda x: 1 if x > 0.122449 else 0)

import seaborn as sns
sns.heatmap(pd.crosstab(df_test_predict['actual'], df_test_predict['predicted']), annot=True, fmt='.0f')
Out[64]:
<matplotlib.axes._subplots.AxesSubplot at 0x57c62e8>
In [65]:
accuracy=metrics.accuracy_score(df_test_predict.actual, df_test_predict.predicted)
print('Accuracy: ',round(accuracy,2))
Accuracy:  0.8
In [66]:
from sklearn.metrics import classification_report
print(classification_report(df_test_predict.actual, df_test_predict.predicted))
             precision    recall  f1-score   support

          0       0.98      0.77      0.86      4543
          1       0.51      0.95      0.66      1149

avg / total       0.89      0.80      0.82      5692

Building a Naive Bayes Model

In [67]:
from sklearn.naive_bayes import GaussianNB

nb_clf=GaussianNB()

nb_clf.fit(train_x,train_y)
Out[67]:
GaussianNB(priors=None)
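
GaussianNB assumes continuous, roughly Gaussian features; for non-negative TF-IDF weights, MultinomialNB is a common alternative worth comparing. A hedged sketch, not part of the original analysis:

from sklearn.naive_bayes import MultinomialNB

# MultinomialNB is designed for non-negative count/TF-IDF style features
mnb_clf = MultinomialNB()
mnb_clf.fit(train_x, train_y)
print('test accuracy:', mnb_clf.score(test_x, test_y))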

Model Evaluation

In [68]:
#Predicting the test cases
from sklearn import metrics
test_accuracy=metrics.accuracy_score(test_y,nb_clf.predict(test_x))
print('test_accuracy: ',test_accuracy)

train_accuracy=metrics.accuracy_score(train_y,nb_clf.predict(train_x))
print('train_accuracy: ',train_accuracy)
test_accuracy:  0.808854532677442
train_accuracy:  0.8018221519463896
In [69]:
print('AUC train :',metrics.roc_auc_score(train_y,nb_clf.predict(train_x)))
print('AUC test :',metrics.roc_auc_score(test_y,nb_clf.predict(test_x)))
AUC train : 0.8365890688804225
AUC test : 0.8448192659371134
In [70]:
from sklearn.metrics import classification_report
print(classification_report(test_y,nb_clf.predict(test_x)))
             precision    recall  f1-score   support

          0       0.97      0.78      0.87      4543
          1       0.52      0.91      0.66      1149

avg / total       0.88      0.81      0.82      5692

In [71]:
# Creating a confusion matrix

from sklearn import metrics

cm = metrics.confusion_matrix(test_y,
                            nb_clf.predict(test_x), [0,1] )
cm

import matplotlib.pyplot as plt
import seaborn as sn
%matplotlib inline

sn.heatmap(cm, annot=True,  fmt='.2f', xticklabels = ["0", "1"] , yticklabels = ["0", "1"] )
plt.ylabel('True label')
plt.xlabel('Predicted label')
Out[71]:
Text(0.5, 15.0, 'Predicted label')

Apply Random Forest Model

In [72]:
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import GridSearchCV
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\ensemble\weight_boosting.py:29: DeprecationWarning: numpy.core.umath_tests is an internal NumPy module and should not be imported. It will be removed in a future NumPy release.
  from numpy.core.umath_tests import inner1d
In [73]:
param_grid={'n_estimators':[100, 200, 400, 600, 800]}

tree=GridSearchCV(RandomForestClassifier(oob_score=False,warm_start=True),param_grid,cv=5,n_jobs=-1)
tree.fit(train_x,train_y)
Out[73]:
GridSearchCV(cv=5, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0, warm_start=True),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'n_estimators': [100, 200, 400, 600, 800]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)
In [75]:
tree.best_params_
Out[75]:
{'n_estimators': 800}
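
Rather than refitting a fresh RandomForestClassifier with the chosen n_estimators, the model refit by GridSearchCV can be reused directly; a small sketch with the same objects as above:

# GridSearchCV refits the best parameter combination on the full training data by default
print('best CV score:', tree.best_score_)
best_rf = tree.best_estimator_   # already-fitted RandomForestClassifier
print('test accuracy:', best_rf.score(test_x, test_y))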
In [76]:
radm_clf=RandomForestClassifier(oob_score=True,n_estimators=800,n_jobs=-1)
radm_clf.fit(train_x,train_y)
Out[76]:
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=800, n_jobs=-1,
            oob_score=True, random_state=None, verbose=0, warm_start=False)

Model Evaluation

In [77]:
#Predicting the test cases
from sklearn import metrics
test_accuracy=metrics.accuracy_score(test_y,radm_clf.predict(test_x))
print('test_accuracy: ',test_accuracy)

train_accuracy=metrics.accuracy_score(train_y,radm_clf.predict(train_x))
print('train_accuracy: ',train_accuracy)
test_accuracy:  0.9214687280393534
train_accuracy:  0.9704841502898878
In [78]:
print('AUC train :',metrics.roc_auc_score(train_y,radm_clf.predict(train_x)))
print('AUC test :',metrics.roc_auc_score(test_y,radm_clf.predict(test_x)))
AUC train : 0.924904214559387
AUC test : 0.8279150375667612
In [79]:
from sklearn.metrics import classification_report
print(classification_report(test_y,radm_clf.predict(test_x)))
             precision    recall  f1-score   support

          0       0.92      0.98      0.95      4543
          1       0.92      0.67      0.78      1149

avg / total       0.92      0.92      0.92      5692

In [80]:
# Creating a confusion matrix

from sklearn import metrics

cm = metrics.confusion_matrix(test_y,
                            radm_clf.predict(test_x), [0,1] )
cm

import matplotlib.pyplot as plt
import seaborn as sn
%matplotlib inline

sn.heatmap(cm, annot=True,  fmt='.2f', xticklabels = ["0", "1"] , yticklabels = ["0", "1"] )
plt.ylabel('True label')
plt.xlabel('Predicted label')
Out[80]:
Text(0.5, 15.0, 'Predicted label')
In [81]:
test_predicted_prob=pd.DataFrame(radm_clf.predict_proba(test_x))[[1]]
test_predicted_prob.columns=['prob']
actual=test_y.reset_index()
actual.drop('index',axis=1,inplace=True)

# making a DataFrame with actual and prob columns
df_test_predict = pd.concat([actual, test_predicted_prob], axis=1)
df_test_predict.columns = ['actual','prob']
df_test_predict.head()
Out[81]:
actual prob
0 0 0.01125
1 0 0.05625
2 1 1.00000
3 0 0.00375
4 0 0.04500
In [82]:
test_roc_like_df = pd.DataFrame()
test_temp = df_test_predict.copy()

for cut_off in np.linspace(0,1,50):
    test_temp['predicted'] = test_temp['prob'].apply(lambda x: 0 if x < cut_off else 1)
    test_temp['tp'] = test_temp.apply(lambda x: 1 if x['actual']==1 and x['predicted']==1 else 0, axis=1)
    test_temp['fp'] = test_temp.apply(lambda x: 1 if x['actual']==0 and x['predicted']==1 else 0, axis=1)
    test_temp['tn'] = test_temp.apply(lambda x: 1 if x['actual']==0 and x['predicted']==0 else 0, axis=1)
    test_temp['fn'] = test_temp.apply(lambda x: 1 if x['actual']==1 and x['predicted']==0 else 0, axis=1)
    sensitivity = test_temp['tp'].sum() / (test_temp['tp'].sum() + test_temp['fn'].sum())
    specificity = test_temp['tn'].sum() / (test_temp['tn'].sum() + test_temp['fp'].sum())
    
    accuracy=(test_temp['tp'].sum()+test_temp['tn'].sum()) / (test_temp['tp'].sum() + test_temp['fn'].sum()+test_temp['tn'].sum() + test_temp['fp'].sum())
    
    test_roc_like_table = pd.DataFrame([cut_off, sensitivity, specificity,accuracy]).T
    test_roc_like_table.columns = ['cutoff', 'sensitivity', 'specificity','accuracy']
    test_roc_like_df = pd.concat([test_roc_like_df, test_roc_like_table], axis=0)
In [83]:
test_roc_like_df.head(5)
Out[83]:
cutoff sensitivity specificity accuracy
0 0.000000 1.000000 0.000000 0.201862
0 0.020408 0.998259 0.351530 0.482080
0 0.040816 0.994778 0.530046 0.623858
0 0.061224 0.988686 0.633502 0.705200
0 0.081633 0.981723 0.696676 0.754216
In [84]:
test_temp.sum()
plt.subplots(figsize=(10,4))
plt.scatter(test_roc_like_df['cutoff'], test_roc_like_df['sensitivity'], marker='*', label='Sensitivity')
plt.scatter(test_roc_like_df['cutoff'], test_roc_like_df['specificity'], marker='*', label='Specificity')
#plt.scatter(test_roc_like_df['cutoff'], 1-test_roc_like_df['specificity'], marker='*', label='FPR')
plt.title('Sensitivity and specificity at each cutoff')
plt.legend()
Out[84]:
<matplotlib.legend.Legend at 0x1d754c88>
In [85]:
# Find the cut-off that maximizes sensitivity + specificity on the test set
test_roc_like_df['total'] = test_roc_like_df['sensitivity'] + test_roc_like_df['specificity']
test_roc_like_df[test_roc_like_df['total']==test_roc_like_df['total'].max()]
Out[85]:
cutoff sensitivity specificity accuracy total
0 0.204082 0.964317 0.806295 0.838194 1.770612
In [86]:
df_test_predict['predicted'] = df_test_predict['prob'].apply(lambda x: 1 if x > 0.204082 else 0)

import seaborn as sns
sns.heatmap(pd.crosstab(df_test_predict['actual'], df_test_predict['predicted']), annot=True, fmt='.0f')
Out[86]:
<matplotlib.axes._subplots.AxesSubplot at 0x1d77c5c0>
In [87]:
accuracy=metrics.accuracy_score(df_test_predict.actual, df_test_predict.predicted)
print('Accuracy: ',round(accuracy,2))
Accuracy:  0.84
In [88]:
from sklearn.metrics import classification_report
print(classification_report(df_test_predict.actual, df_test_predict.predicted))
             precision    recall  f1-score   support

          0       0.99      0.81      0.89      4543
          1       0.56      0.96      0.71      1149

avg / total       0.90      0.84      0.85      5692

Text Clustering

In [89]:
train_x.shape
Out[89]:
(13281, 331)
In [90]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score
In [91]:
title=list2
title[0:5]
Out[91]:
['chief financial officer',
 'community connection intern pay internship',
 'country coordinator',
 'specialist',
 'software developer']
In [92]:
# Creating the Tf-Idf model directly
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(max_features = 2000, min_df = 0.01, max_df = 0.9, stop_words = stopwords.words('english'))
X = vectorizer.fit_transform(title).toarray()
In [93]:
X.shape
Out[93]:
(16537, 48)

Cluster Errors (Elbow Method)

In [94]:
cluster_range = range( 1, 21 )
cluster_errors = []

for num_clusters in cluster_range:
    clusters = KMeans( num_clusters )
    clusters.fit(X)
    cluster_errors.append( clusters.inertia_ )
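
KMeans inertia values vary from run to run without a fixed seed; for a reproducible elbow curve the loop can pin random_state, as in this minor variation on the cell above:

# Reproducible variant: fix the seed so the inertia values are stable across runs
cluster_errors = []
for num_clusters in cluster_range:
    clusters = KMeans(n_clusters=num_clusters, random_state=1)
    clusters.fit(X)
    cluster_errors.append(clusters.inertia_)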
In [95]:
clusters_df = pd.DataFrame( { "num_clusters":cluster_range, "cluster_errors": cluster_errors } )

clusters_df[0:21]
Out[95]:
num_clusters cluster_errors
0 1 13631.991020
1 2 12320.649888
2 3 11474.188001
3 4 11009.692194
4 5 10027.822592
5 6 9546.478743
6 7 9287.663126
7 8 9088.867504
8 9 8578.701953
9 10 8489.231648
10 11 7976.736505
11 12 7826.614321
12 13 7686.765777
13 14 7231.006232
14 15 7074.110078
15 16 6739.365930
16 17 6550.441574
17 18 6427.778388
18 19 6124.412782
19 20 6047.016351
In [96]:
# allow plots to appear in the notebook
%matplotlib inline
import matplotlib.pyplot as plt
plt.figure(figsize=(12,6))
plt.plot( clusters_df.num_clusters, clusters_df.cluster_errors, marker = "o" )
Out[96]:
[<matplotlib.lines.Line2D at 0x1f3e1940>]

Silhouette Coefficient

In [97]:
from sklearn import metrics
In [ ]:
# calculate the Silhouette Coefficient for K=2 through K=20
k_range = range(2, 21)
scores = []
for k in k_range:
    km = KMeans(n_clusters=k, random_state=1)
    km.fit(X)
    scores.append(metrics.silhouette_score(X, km.labels_))
In [ ]:
# plot the results
plt.plot(k_range, scores)
plt.xlabel('Number of clusters')
plt.ylabel('Silhouette Coefficient')
plt.grid(True)
In [ ]:
true_k = 5
model = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1)
model.fit(X)
In [ ]:
print("Top terms per cluster:")
order_centroids = model.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names()
for i in range(true_k):
    print("Cluster %d:" % i),
    for ind in order_centroids[i, :10]:
        print(' %s' % terms[ind]),
    print
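
To see which job titles fall into each cluster, the fitted labels can be attached back to the titles; a minimal sketch assuming the title list and fitted model above:

# Pair each cleaned title with its assigned cluster id
clustered = pd.DataFrame({'title': title, 'cluster': model.labels_})
print(clustered['cluster'].value_counts())            # cluster sizes
print(clustered[clustered['cluster'] == 0].head(10))  # sample titles from cluster 0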

Happy Learning.....